* ------------------------------------------------------------------------------
* Session set-up: empty memory, set Stata limits, then change directory to the
* project folder of the machine whose switch below is set to 1.
* ------------------------------------------------------------------------------
clear
clear matrix
* NOTE(review): matsize is set twice. The second call (800) is the value in
* force for this run; the first call also stores 11000 as the permanent default.
set matsize 11000, permanently
set maxvar 32767
set matsize 800
{
    * Machine switches: each flag that equals 1 triggers its own cd below.
    local home        0
    local officeGDF   0
    local stefanoiMac 1
    local stefanoJRC  0

    if `officeGDF' {
        global path "C:\Users\lezgd\Dropbox\CareerMiur\analisi(career new)"
        cd "$path"
    }
    if `home' {
        global path "C:\Users\gianni\Dropbox\CareerMiur\analisi(career new)"
        cd "$path"
    }
    if `stefanoiMac' {
        global path "/Users/stefanoverzillo/Dropbox/Career Miur/analisi(career new)"
        cd "$path"
    }
    if `stefanoJRC' {
        global path "C:\Users\verzist\Dropbox\Career Miur\analisi(career new)"
        cd "$path"
    }
}
********************************************************************************

use datacdv2, clear

* Relative importance of the "max 5 applications per candidate" rule after the
* reform. It must be computed on a yearly basis (starred commands were already
* executed in datacdv.do).
*
* k1 is binned with three cut-offs; the resulting codes are
*   0: k1 <= kA      1: kA < k1 <= kAA      2: kAA < k1 <= kAAA      4: k1 > kAAA
* ruleR is filled only where (fascia==1 | ricass==1),
* ruleA only where (fascia==2 | assord==1); elsewhere they stay missing.

local kA   = 5
local kAA  = 12
local kAAA = 2000

* Row filters for the two indicators
local maskR "(fascia==1 | ricass==1)"
local maskA "(fascia==2 | assord==1)"

foreach g in R A {
    generate rule`g' = 0 if k1 <= `kA' & `mask`g''
    replace  rule`g' = 1 if k1 > `kA'   & k1 <= `kAA'  & `mask`g''
    replace  rule`g' = 2 if k1 > `kAA'  & k1 <= `kAAA' & `mask`g''
    * the open-ended top bin must explicitly exclude missing k1 (missing sorts high)
    replace  rule`g' = 4 if k1 > `kAAA' & k1 != .      & `mask`g''
}


* Attach the ssd/period-level files Mass2 and Mord2. After each merge, tabulate
* the listed check variable on master-only rows (_merge==1), then discard _merge.
foreach pair in "Mass2 ricass" "Mord2 assord" {
    local srcfile  : word 1 of `pair'
    local checkvar : word 2 of `pair'

    sort ssd period
    merge m:1 ssd period using `srcfile'
    tab `checkvar' if _merge==1
    drop _merge
}

* ------------------------------------------------------------------------------
* Individual productivity measured against full professors.
* For every ssdcode/period cell the mean of soutput1e / soutput1b within each
* fascia is computed and spread to all rows of the cell
* (new products, not cumulated ones).
* ------------------------------------------------------------------------------

* Missing output / H-index values are recoded to zero
foreach v in output1e output1b rH H {
    replace `v' = 0 if `v' == .
}

* prod<f><s>: cell mean of soutput1<s> among fascia f, copied to every row of the cell
foreach s in e b {
    forvalues f = 1/3 {
        egen tmp`f'`s'  = mean(soutput1`s') if fascia==`f', by(ssdcode period)
        egen prod`f'`s' = max(tmp`f'`s'), by(ssdcode period)
    }
    drop tmp*
}

* sd3<s>: standard deviation of full professors' (fascia==3) output in the cell
foreach s in e b {
    egen tmp3`s' = sd(soutput1`s') if fascia==3, by(ssdcode period)
    egen sd3`s'  = max(tmp3`s'), by(ssdcode period)
    drop tmp*
}

* Head-counts of the three groups (non-missing soutput1e) and the pyramid ratios
forvalues f = 1/3 {
    egen tmp`f'e = count(soutput1e) if fascia==`f', by(ssdcode period)
    egen num`f'e = max(tmp`f'e), by(ssdcode period)
}
gen pyramidord = num2e/num3e
gen pyramidass = num1e/num3e
drop tmp*
drop num3* num2* num1*

* Standardisation: own output divided by (1 + full professors' mean output)
foreach s in e b {
    g ioutput1`s' = soutput1`s'/(1+prod3`s')
    label var ioutput1`s' "outptu1`s' relative to full professors per period/ssd productivity"
}
sum output1e output1b ioutput1e ioutput1b

* Average productivity of fascia 2 (syA*) and fascia 1 (syR*) by ssd, on the same scale
foreach s in e b {
    g syAoutput1`s' = prod2`s'/(1+prod3`s')
    g syRoutput1`s' = prod1`s'/(1+prod3`s')
}
sum syAoutput* syRoutput*
corr syAoutput* syRoutput*

* Regressions can be run on a yearly basis (while M and g are computed at period
* frequency) or at period frequency. We proceed at period frequency. For the
* meritocracy/competition indices this raises a problem: 3204 professors change
* ssd within a period. For them the relevant values are set to those prevailing
* at the beginning of each period.

* tmp == 1 for every row of an id_prof/period in which ssd differs from the
* previous year's ssd of the same professor.
sort id_prof anno
gen ssdchange = (ssd != ssd[_n-1] & id_prof == id_prof[_n-1])
egen tmp = max(ssdchange), by(id_prof period)

* Diagnostics only (quiet): Kord before the correction
qui tab Kord
egen f1Kord = mean(Kord) if tmp == 0, by(id_prof period)
qui tab f1Kord
egen f2Kord = mean(Kord), by(id_prof period)
qui tab f2Kord

* t2 == 1 on the first year of each id_prof/period, missing elsewhere
sort id_prof anno
egen t1 = min(anno), by(id_prof period)
g t2 = 1 if anno == t1

* For each index: tt<v> keeps the value on the first-year row only, ff<v> spreads
* it over the whole id_prof/period, and the index is overwritten for ssd-changers.
* Helper names are built from the index name so that the variable created and the
* variable read always match, with no reliance on variable-name abbreviation.
foreach v in mMassE mMassB mMordE mMordB Kass Nass Kord Nord {
    g tt`v' = `v' * t2
    egen ff`v' = max(tt`v'), by(id_prof period)
    replace `v' = ff`v' if tmp == 1
}

* Diagnostics only (quiet): Kord after the correction
egen f3Kord = mean(Kord), by(id_prof period)
qui tab f3Kord

drop f1Kord f2Kord f3Kord tmp tt* ff*

*******************************************************************************************************************
* creation of relevant dataset
*******************************************************************************************************************

* biblio = 1 for areacun 1-7 and 9 and for the listed ICAR and M-PSI sectors, 0 otherwise.
* String inlist() calls are split so each stays well inside the argument limit.
gen biblio = inlist(areacun, 1, 2, 3, 4, 5, 6, 7, 9) ///
    | inlist(ssd, "ICAR/01", "ICAR/02", "ICAR/03", "ICAR/04", "ICAR/05") ///
    | inlist(ssd, "ICAR/06", "ICAR/07", "ICAR/08", "ICAR/09", "ICAR/22") ///
    | inlist(ssd, "M-PSI/01", "M-PSI/02", "M-PSI/03", "M-PSI/04") ///
    | inlist(ssd, "M-PSI/05", "M-PSI/06", "M-PSI/07", "M-PSI/08")

sum *final

qui tab id_uni region
drop regionefinal

* Flag the last yearly row of every id_prof/period. Ordering by anno inside the
* by-group makes "last" the latest year; without it the within-group order after
* bysort is arbitrary and the flagged row could differ from run to run.
bysort id_prof period (anno): gen byte lastinperiod = (_n == _N)

* On the flagged row, copy the current value into the matching *final variable.
* regionfinal is created here and stays missing on all other rows.
gen regionfinal = .
foreach var in ssdcode id_uni areacun region submacro {
    replace `var'final = `var' if lastinperiod
}
drop lastinperiod
sort id_prof anno
sum *final

*** variables sd3e  sd3b pyramidord pyramidass in last row of (mean) added by gdf 26/4

* Collapse the yearly panel to one row per id_prof/period: means of the listed
* variables (including the fascia dummies fasc1-fasc3) and maxima of ricord,
* ricass and assord.
tab fascia, g(fasc)
collapse (mean) age female avage shfemale av1age av2age av3age sd3age sh1female sh2female sh3female homo* ///
    k1 n1 k2 n2 gR gA ruleR ruleA reform output1b output1e ioutput1b ioutput1e H rH syA* syR* fasc1-fasc3 ///
    id_unifinal regionfinal ssdcodefinal areacunfinal submacrofinal quartile1e quartile1b decile1e decile1b ///
    mMordE sdMordE mMordB sdMordB Nord Kord concorsoord mMassE sdMassE mMassB sdMassB Nass Kass concorsoass extass extord biblio ///
    sd3e sd3b pyramidord pyramidass ///
    (max) ricord ricass assord, by(id_prof period)

* Re-attach value labels to the collapsed *final variables
label values id_unifinal ateneo
label values ssdcodefinal ssdcode
label values regionfinal region
label values submacrofinal submacro

* After the collapse the only area variable left is areacunfinal, so it is named
* explicitly here instead of relying on variable-name abbreviation.
* NOTE(review): codes 15 and 16 are not covered by the recode and pass through unchanged - confirm.
gen subject = areacunfinal
recode subject 2=1 3=1 4=1 5=1 6=2 7=1 8=3 9=3 10=4 11=4 12=4 13=5 14=5 99=.
label values subject subject
label values period period

label define areacun 1 "mathematics and computer science" 2 "physics" 3 "chemistry" 4 "earth science" 5 "biology" ///
    6 "medicine" 7 "agriculture and veterinary sciences" 8 "engineering and architecture(biblio)" 15 "architecture (not biblio)" 9 "industrial engineering and ICT" ///
    10 "humanities" 11 "history, philosophy" 16 "psichology" 12 "law" 13 "economics and statistics" 14 "sociology and political science" 99 "missing", modify
label values areacunfinal areacun


* Competitiveness ratios (K over N) for the two promotion levels
gen KNass = Kass/Nass
gen KNord = Kord/Nord
label var KNass "degree of competitiveness - associate professor"
label var KNord "degree of competitiveness - full professor"

* Demographics by ssd/period
label var av1age "average age by ssd/period - assistant professor"
label var sh1female "female share by ssd/period - assistant professor"
label var av2age "average age by ssd/period - associate professor"
label var sh2female "female share by ssd/period - associate professor"
label var av3age "average age by ssd/period - full professor"
label var sh3female "female share by ssd/period - full professor"
label var sd3age "sd.dev age by ssd/period - full professor"
label var sd3e "sd.dev output (type e) by ssd/period - full professor"
label var sd3b "sd.dev output (type b) by ssd/period - full professor"
label var pyramidord "number of associate per full prof by ssd/period - full professor"
label var pyramidass "number of assistant per full prof by ssd/period - full professor"

* Meritocracy indices and effort quantiles
label var mMassE "meritocracy of competition for associate based on H+cumulated output1e"
label var mMassB "meritocracy of competition for associate based on H+cumulated output1b"
label var mMordE "meritocracy of competition for full based on H+cumulated output1e"
label var mMordB "meritocracy of competition for full based on H+cumulated output1b"
label var quartile1e "quartile of effort by ssd/period/fascia on within-period cumulated output1e"
label var quartile1b "quartile of effort by ssd/period/fascia on within-period cumulated output1b"

* The rule labels state the coding actually used when ruleR/ruleA are built:
* cut-offs 5, 12 and 2000 on k1, codes 0, 1, 2 and 4. The variables were coded
* yearly and are period means at this point.
label var ruleR "0=k<=5, 1=5<k<=12, 2=12<k<=2000, 4=k>2000 for associate - yearly based"
label var ruleA "0=k<=5, 1=5<k<=12, 2=12<k<=2000, 4=k>2000 for full - yearly based"

* Output and citation measures
label var ioutput1e "output1e relative to full professors per period/ssd productivity"
label var ioutput1b "output1b relative to full professors per period/ssd productivity"
label var H "H index - citations estimated backward"
label var rH "real H index - citations estimated backward"
label var extass "fraction of newass on ricass"
label var extord "fraction of neword on assord+ricord"
label var biblio "whether RAE considers it as bibliometric sector"
label var homo1 "same name+initial in Italy/year"
label var homo2 "same name+initial in university/year"
label var homo3 "same name+initial in subject/university/Italy/year"

* Declare the professor-by-period panel and store the regression dataset
xtset id_prof period
compress
save regression10, replace
